require(readxl)
require(skimr)
require(PerformanceAnalytics)
require(factoextra)
require(patchwork)
require(tidyverse); theme_set(theme_bw())
source(file.path('src', 'models', '900_funcionesAlmacenamientoGrafico.R'), encoding = 'UTF-8')
fig_path <- file.path('figures', '011_clasificacion')
data <-
  read_excel(file.path(
    'data',
    'raw',
    'ClasificacionFRE',
    'variablesClasificacionFRE.xlsx'
  ), na = '-') %>% 
  mutate(Departamento...2 = str_to_title(Departamento...2))
## New names:
## * Departamento -> Departamento...1
## * Departamento -> Departamento...2
data %>%
  select(!contains('Departamento')) %>% 
  chart.Correlation(., histogram = TRUE, pch = 19)
Correlación entre variables de PCA

Correlación entre variables de PCA

if (knitr::is_html_output()) {
  skimr::skim(data)
}
Data summary
Name data
Number of rows 33
Number of columns 9
_______________________
Column type frequency:
character 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Departamento…2 0 1 4 40 0 33 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Departamento…1 0 1.00 1.700000e+01 9.670000e+00 1.000000e+00 9.000000e+00 1.700000e+01 2.500000e+01 3.300000e+01 ▇▇▇▇▇
Presupuesto_2020 0 1.00 9.214848e+11 8.500217e+11 1.020000e+11 3.770000e+11 8.160000e+11 1.239000e+12 4.383000e+12 ▇▃▁▁▁
Presupuesto_Salud_2020 1 0.97 1.092351e+11 1.778047e+11 1.021624e+09 3.379498e+10 5.854786e+10 1.053443e+11 9.783884e+11 ▇▁▁▁▁
N_productosCD_2021 0 1.00 3.721000e+01 6.164000e+01 0.000000e+00 0.000000e+00 5.000000e+00 3.300000e+01 2.080000e+02 ▇▁▁▁▁
Cumplimiento_A1_2020-2021-06 0 1.00 9.800000e-01 7.000000e-02 7.500000e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 ▁▁▁▁▇
Cumplimiento_A2_2020-2021-06 2 0.94 5.200000e-01 3.900000e-01 0.000000e+00 1.100000e-01 6.100000e-01 8.600000e-01 1.000000e+00 ▇▂▁▅▇
PropPortafolio 0 1.00 8.400000e-01 1.900000e-01 3.500000e-01 8.200000e-01 9.400000e-01 1.000000e+00 1.000000e+00 ▁▁▁▁▇
No_Inscritos 2 0.94 1.593200e+02 2.144000e+02 1.000000e+00 2.650000e+01 7.800000e+01 2.220000e+02 1.054000e+03 ▇▂▁▁▁

1. Análisis por PCA

pca1 <- data %>%
  drop_na() %>% 
  column_to_rownames('Departamento...2') %>% 
  select(!contains('Departamento')) %>% 
  prcomp(., scale = TRUE, center = TRUE)
plot(pca1, main = 'Distribución de varianza')
Distribución de las varianzas

Distribución de las varianzas

gg1 <- fviz_pca_ind(pca1,
             col.ind = "cos2", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     
) + labs(title = 'PCA de individuos')
gg1
Individuos representados en PCA

Individuos representados en PCA

guardarGGplot(gg1, '001_fig_ind', 8, 6, fig_path)

gg2 <- fviz_pca_var(
  pca1,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) + labs(title = 'PCA - Explicación de variables')
gg2
Explicación de variables en PCA

Explicación de variables en PCA

guardarGGplot(gg2, '002_fig_var', 8, 6, fig_path)

clean_data <- data %>%
  drop_na() %>% 
  column_to_rownames('Departamento...2') %>% 
  select(!contains('Departamento'))

2. Análisis de Clústers por Kmeans

funClusters <- function(data, k) {
  data %>%
    kmeans(k)
}
k.values <- data.frame(k = 1:30)
k.values['TWITH'] <- map_dbl(k.values$k, ~funClusters(clean_data, .x)$tot.withinss)

gg3 <- k.values %>% 
  ggplot(aes(x = k, y = TWITH)) + 
  geom_point() + geom_line() + 
  geom_vline(xintercept = 4, lty = 'dashed', col = 'blue3') +
  ylab('Suma de cuadrados \n dentro de clústers') + 
  xlab('N.° de Clusters, k')
gg3
Criterio de codo para clústers por Kmeans

Criterio de codo para clústers por Kmeans

guardarGGplot(gg3, '003_elbowlKmeans', 6, 4, fig_path)


g1 <- funClusters(clean_data, 4) %>%
  fviz_cluster(., data = clean_data) +
  theme_bw() + labs(title = NULL)
g1
Clústers Kmeans visualizados en primeros dos PC

Clústers Kmeans visualizados en primeros dos PC

guardarGGplot(g1, '004_clusterKmeans', 8, 5, fig_path)


g2 <- funClusters(clean_data, 4) %>%
  fviz_cluster(., data = clean_data, axes = c(1, 2)) +
  theme_bw() + labs(title = NULL)
g3 <- funClusters(clean_data, 4) %>%
  fviz_cluster(., data = clean_data, axes = c(1, 3)) +
  theme_bw() + labs(title = NULL)
g4 <- funClusters(clean_data, 4) %>%
  fviz_cluster(., data = clean_data, axes = c(2, 3)) +
  theme_bw() + labs(title = NULL)
g5 <- funClusters(clean_data, 4) %>%
  fviz_cluster(., data = clean_data, axes = c(1, 4)) +
  theme_bw() + labs(title = NULL)


ggt <- wrap_plots(g2, g3, g4, g5)
ggt
Clústers Kmeans visualizados en varios componentes

Clústers Kmeans visualizados en varios componentes

guardarGGplot(ggt, '005_cluz_group', 12, 8, fig_path)

3. Clúster jerárquicos

funClusters_2 <- function(data, k) {
  t1 <- data %>% 
    dist(method = 'euclidean') %>% 
    hclust(method = 'complete')
  
  t2 <- cutree(t1, k)
  return(list(clust = t1, tree = t2))
}
p1 <- plot(funClusters_2(clean_data, 3)$clust, cex = 0.6, hang = -1, ylab = 'Altura',
     main = 'Dendrograma de clúster', xlab = NULL)
Dendrograma de análisis por clústers

Dendrograma de análisis por clústers

pdf(file.path(fig_path, '010_dendrograma.pdf'), width = 12, height = 8)
plot(funClusters_2(clean_data, 3)$clust, cex = 0.6, hang = -1, ylab = 'Altura',
     main = 'Dendrograma de clúster', xlab = NULL)
dev.off()
## png 
##   2
saveRDS(p1, file.path(fig_path, '010_dendrograma.rds'))

gg1 <- funClusters_2(clean_data, 3)$clust$height %>%
  as.tibble() %>%
  add_column(groups = length(funClusters_2(clean_data, 3)$clust$height):1) %>%
  rename(height = value) %>% 
  ggplot(aes(x=groups, y = height)) +
  geom_point() + geom_line() +
  geom_vline(xintercept = 5, lty = 'dashed', col = 'blue3') +
  ylab('Altura') + 
  xlab('N.° de Clusters, k')
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
gg1
Criterio de codo para clústers jerárquicos

Criterio de codo para clústers jerárquicos

guardarGGplot(gg1, '012_elbowlWard', 6, 4, fig_path)

gg2 <- clean_data %>% 
  {fviz_cluster(list(data = ., cluster = funClusters_2(., 5)$tree))} +
  theme_bw() + labs(title = NULL) +
  scale_color_discrete(name = 'Clúster') + 
  scale_shape_discrete(name = 'Clúster') + 
  scale_fill_discrete(name = 'Clúster') 
gg2
Clústers Jerárquicos visualizados en primeros dos componentes

Clústers Jerárquicos visualizados en primeros dos componentes

guardarGGplot(gg2, '013_cluz_group', 8, 5, fig_path)


funClusters_3 <- function(axes) {
  clean_data %>% 
    {fviz_cluster(list(data = ., cluster = funClusters_2(., 5)$tree), 
                  axes = axes, ellipse = T)} +
    theme_bw() + labs(title = NULL) +
    scale_color_discrete(name = 'Clúster') + 
    scale_shape_discrete(name = 'Clúster') + 
    scale_fill_discrete(name = 'Clúster') 
}



g2 <- funClusters_3(c(1,2))
g3 <- funClusters_3(c(1,3))
g4 <- funClusters_3(c(2,3))
g5 <- funClusters_3(c(1,4))


ggt <- wrap_plots(g2, g3, g4, g5)
ggt
Clústers jerarquizados visualizados en varios componentes

Clústers jerarquizados visualizados en varios componentes

guardarGGplot(ggt, '014_cluz_group2', 12, 10, fig_path)
require(plotly)

trans_data <- as_tibble(pca1$x, rownames = 'Departamentos') %>% 
  left_join(funClusters_2(clean_data, 5)$tree %>% 
              as_tibble(rownames = 'Departamentos'), by = 'Departamentos') %>% 
  rename(Grupo_hclus = value) %>% 
  mutate(Grupo_hclus = as.integer(Grupo_hclus))


colors <- c('#4AC6B7', '#1972A4', '#965F8A', '#FF7070', '#C61951')


fig <- trans_data %>% 
  plot_ly(x = ~PC1, y = ~PC2, z = ~PC3, split = ~Grupo_hclus,
          colors = colors, name = ~Grupo_hclus, text = ~Departamentos, 
          hovertemplate = "%{text}<br>PC1: %{x}<br>PC2: %{y}<br>PC3: %{z}")
if (knitr::is_html_output()) {
  fig %>% 
    add_markers()
}

PCA3d

trans_data1 <- data %>% 
  left_join(funClusters_2(clean_data, 5)$tree %>% 
              as_tibble(rownames = 'Departamentos'), 
            by = c('Departamento...2' = 'Departamentos')) %>% 
  rename(Grupo_hclus = value, Departamentos = Departamento...2) %>% 
  mutate(Grupo_hclus = as.integer(Grupo_hclus))

fig1 <- trans_data1 %>% 
  plot_ly(x = ~No_Inscritos, y = ~`Cumplimiento_A2_2020-2021-06`, 
          z = ~PropPortafolio, split = ~Grupo_hclus,
          colors = colors, name = ~Grupo_hclus, text = ~Departamentos, 
          hovertemplate = "%{text}<br>N.° inscritos: %{x}<br>Cumplimiento A2: %{y}<br>Prop. portafolio: %{z}")
if (knitr::is_html_output()) {
  fig1 %>% 
    add_markers()
}
## Warning: Ignoring 2 observations

PCA3d-1